##########################################################################
######### Code for Figures 1-2 and Appendix L

# Main text and online appendix
# 1. Getting data for Google Trends and Fact-Checking
# 2. Figure 1: Google Trends
# 3. Figure 2: Fact-Checking Information

# Appendix L social media data on news coverage
# Figures L1 and L2
# Tables L1 to L3
# Data cited in the Appendix

##########################################################################

# Start from a clean global environment: removes every non-hidden object in it.
# This does not detach packages or reset options.
rm(list=ls())
# A large scipen penalty makes R prefer fixed over scientific notation when
# printing numbers.
options(scipen = 999) #disabling sci notation

library(stringr)
library(stringi)
library(tidyverse)
library(foreign)
library(stopwords)
library(lubridate)
library(scales)
library(readxl)
library(zoo)
library(xtable)
library(Cairo)
library(ggplot2)

# Set the working directory to the folder that contains the Data/, Figures/
# and Tables/ directories: every path used below is relative to it.
# The empty string is a placeholder that must be filled in before running;
# setwd("") itself stops with an error.
setwd("") #set working directory

################################################################################
#################### Searches on Google Trends (Figure 1)
################################################################################

# Monthly Google Trends export for Brazil. The first three lines of the file
# are skipped and the two columns are named explicitly.
# The month label gets "-01" appended so it can be parsed as a Date
# (presumably the labels are "YYYY-MM" -- confirm against the CSV), and the
# "<1" placeholder is replaced by 0.5 before the series is made numeric.
br <- read_csv(
  "Data/gtrends/Brazil.csv",
  col_names = c("month", "fake_news"),
  skip = 3
) %>%
  mutate(
    month = as.Date(paste0(month, "-01")),
    fake_news = as.numeric(ifelse(fake_news == "<1", 0.5, fake_news))
  )


# Figure 1a: monthly Google Trends bars for months after 2010-01-01.
# The dashed reference line sits at the month stored in row 178 of the
# unfiltered `br` series (hard-coded row index).
fig1a <- ggplot(
  filter(br, month > as.Date("2010-01-01")),
  aes(x = month, y = fake_news)
) +
  geom_col(fill = "black", alpha = 0.8) +
  geom_vline(
    xintercept = as.numeric(br$month[178]),
    linetype = "longdash",
    color = "gray",
    size = 0.8
  ) +
  scale_x_date(labels = date_format("%m-%Y")) +
  labs(x = "Month", y = "Google Trends") +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/fig1a.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(fig1a)
dev.off()

# Google Trends export for the "kit gay" search in 2018 (per the file name).
# The first three lines are skipped, a week-of-year index is derived from the
# date column, and the "<1" placeholder is replaced by 0.5 before the series
# is made numeric.
kg <- read_csv(
  "Data/gtrends/Brazil_Kitgay_2018.csv",
  col_names = c("date", "kit_gay"),
  skip = 3
) %>%
  mutate(
    week = week(date),
    kit_gay = as.numeric(ifelse(kit_gay == "<1", 0.5, kit_gay))
  )

# Figure 1b: Google Trends bars by week of 2018.
# Two reference lines are drawn at the week values stored in rows 34 (dotted)
# and 40 (long-dashed) of `kg` (hard-coded row indices).
fig1b <- ggplot(kg, aes(x = week, y = kit_gay)) +
  geom_col(fill = "black", alpha = 0.8) +
  geom_vline(
    xintercept = as.numeric(kg$week[c(34, 40)]),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.8, .8)
  ) +
  labs(x = "Weeks in 2018", y = "Google Trends") +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/fig1b.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(fig1b)
dev.off()


##########################################################
#Fact-checking descriptive (Figure 2)
##########################################################

#Original scraped dataset
#The complete dataset (with all the variables) is not being shared because it is being used in other
#projects in preparation. Therefore, the commented out code is not replicable. 
#We are sharing all the variables used in this study. 
# dataset_news <-  readr::read_csv("Data/fact_checking_data/dataset_final_unico.csv", 
#                                  col_types = cols(tags = col_character(), url = col_character(), 
#                                                   text_factcheck = col_character())) %>% select(website, date_published, tags, classification)
# saveRDS(dataset_news, file = "Data/fact_checking_data/dataset_news.Rds")

# Shared extract of the scraped fact-checking data (website, date_published,
# tags, classification -- see the commented-out export above).
dataset_news <- read_rds("Data/fact_checking_data/dataset_news.Rds")

# Derived columns:
#   year_month    - publication month as a zoo yearmon (dates parsed as d/m/y)
#   dates2        - text label of that month; the month abbreviation follows
#                   the session locale (e.g. "Aug 2018" in an English locale)
#   election_time - 1 for stories published in Aug-Oct 2018, 0 otherwise
#                   (including stories whose date could not be parsed)
# The election flag is computed from a numeric "%Y-%m" key rather than from
# `dates2`, so it does not depend on the locale's month names.
dataset_news <- dataset_news %>%
  mutate(
    year_month = zoo::as.yearmon(lubridate::dmy(date_published)),
    dates2 = as.character(year_month),
    election_time = ifelse(
      format(year_month, "%Y-%m") %in% c("2018-08", "2018-09", "2018-10"),
      1, 0
    )
  )

# Election-period stories (election_time == 1), one subset per fact-checking
# website.
election_dataset_boatos <- filter(
  dataset_news,
  election_time == 1 & website == "boatos_org"
)
election_dataset_fatooufake <- filter(
  dataset_news,
  election_time == 1 & website == "fato_ou_fake"
)
election_dataset_aosfatos <- filter(
  dataset_news,
  election_time == 1 & website == "aosfatos.org"
)
election_dataset_lupa <- filter(
  dataset_news,
  election_time == 1 & website == "lupa"
)

# Boatos stories tagged exactly "Política". The "Lista Política" tag is left
# out because those entries are compilations.
election_dataset_boatos_politics <- filter(
  election_dataset_boatos,
  tags %in% c("Política")
)

# Hand-coded Boatos stories (first sheet of the public workbook), followed by
# a tabulation of the manual fact-check coding.
hand <- read_excel(
  path = "Data/fact_checking_data/PlanilhaBoatosFC_public.xlsx",
  sheet = 1
)
table(hand$FACT_CHECK)
# There are 194 false political news stories here.

# Shaded 2018 election window used by Figures 2a and 2b: one row per website
# with the first (xmin) and last (xmax) month, among months 8-11 of 2018, in
# which that website published at least one story. ymin/ymax are infinite so
# the rectangle spans the whole y axis.
# Months are represented by their first day, so including month 11 lets the
# band reach 2018-11-01, i.e. the end of October, when November stories exist
# (presumably why 11 is listed alongside Aug-Oct -- confirm).
plt_dummy <- dataset_news %>%
  mutate(year_month = zoo::as.yearmon(lubridate::dmy(date_published))) %>%
  # Exactly one summary row per month x website. Returning a per-story `year`
  # vector from summarise() is deprecated in dplyr 1.1, so the year filter is
  # applied afterwards instead.
  group_by(year_month, website) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(lubridate::year(year_month) >= 2010) %>%
  mutate(year_month = as.Date(year_month)) %>%
  filter(lubridate::month(year_month) %in% c(8, 9, 10, 11)) %>%
  mutate(year = year(year_month)) %>%
  group_by(website, year) %>%
  # Keep the result grouped by website, as downstream code calls ungroup().
  summarise(xmin = min(year_month),
            xmax = max(year_month),
            .groups = "drop_last") %>%
  mutate(ymin = -Inf, ymax = Inf) %>%
  filter(year == 2018)

# Figure 2b: monthly number of Boatos stories tagged as politics and
# classified as false ("FALSO"), from 2010 onwards, with the 2018 election
# window from `plt_dummy` shaded.
fig2b <- dataset_news %>%
  mutate(classification = toupper(classification)) %>%
  filter(website == "boatos_org") %>%
  filter(grepl(x = tags, pattern = "(P|p)ol(í|i)tica"),
         classification == "FALSO") %>%
  # Removing "Lista" because these seem to be primarily compilations.
  filter(tags != "Lista Política") %>%
  mutate(year_month = zoo::as.yearmon(lubridate::dmy(date_published))) %>%
  # Exactly one row per month: returning a per-story `year` vector from
  # summarise() is deprecated in dplyr 1.1 and duplicated every monthly
  # point, so the year filter is applied afterwards instead.
  group_by(year_month, website) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(lubridate::year(year_month) >= 2010) %>%
  # Months are plotted at their first day.
  mutate(year_month = as.Date(year_month)) %>%
  ggplot() +
  geom_line(aes(x = year_month,
                y = count,
                color = website)) +
  labs(color = "Fact-Checking Website",
       x = "Month-Year (2010-2020)",
       y = "Frequency") +
  scale_x_date(date_breaks = "1 year",
               date_labels = "%Y") +
  geom_rect(data = plt_dummy %>% filter(website == "boatos_org"),
            aes(xmin = as.Date(xmin),
                xmax = as.Date(xmax),
                ymin = ymin,
                ymax = ymax),
            alpha = 0.4) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold"),
        legend.text = element_text(face = "bold"),
        axis.text = element_text(face = "bold"),
        axis.title = element_text(face = "bold"),
        legend.position = "none") +
  scale_color_manual(values = c("#000000"))

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(filename = "Figures/fig2b.eps",
         width = 7, height = 7, pointsize = 12,
         fallback_resolution = 800)
print(fig2b)
dev.off()

# Figure 2a: monthly number of fact-checking stories across all websites,
# from 2010 onwards, with a 2018 election window shaded (the first row of
# `plt_dummy`).
fig2a <- dataset_news %>%
  mutate(year_month = zoo::as.yearmon(lubridate::dmy(date_published))) %>%
  # Exactly one row per month: returning a per-story `year` vector from
  # summarise() is deprecated in dplyr 1.1 and produced duplicated rows that
  # then had to be removed, so the year filter is applied afterwards instead.
  group_by(year_month) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(lubridate::year(year_month) >= 2010) %>%
  # Months are plotted at their first day.
  mutate(year_month = as.Date(year_month)) %>%
  ggplot() +
  geom_area(aes(x = year_month, y = count), fill = "black") +
  scale_x_date(date_breaks = "1 year",
               date_labels = "%Y") +
  geom_rect(data = plt_dummy %>% filter(year == 2018) %>% ungroup() %>% head(1),
            aes(xmin = as.Date(xmin),
                xmax = as.Date(xmax),
                ymin = ymin,
                ymax = ymax),
            alpha = 0.5) +
  theme_minimal(base_size = 14) +
  labs(color = "Fact-Checking Websites",
       x = "Month-Year (2010-2020)",
       y = "Frequency") +
  theme(plot.title = element_text(face = "bold"),
        legend.text = element_text(face = "bold"),
        axis.text = element_text(face = "bold"),
        axis.title = element_text(face = "bold"))

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(filename = "Figures/fig2a.eps",
         width = 7, height = 7, pointsize = 12,
         fallback_resolution = 800)
print(fig2a)
dev.off()

################################################################################
#################### Media data on news coverage in 2018 (Appendix L Figures)
################################################################################

# #Data from CrowdTangle, a public insights tool owned and operated by Facebook.
# #Citation: CrowdTangle Team (2021). CrowdTangle. Facebook, Menlo Park, California, United States. List ID:1455754
# #Data from CrowdTangle cannot be shared publicly by researchers. However, the provided list ID should allow
# #interested readers to learn more details about our data collection. The commented out code cannot be
# #directly verified because the raw data cannot be shared.
# 
# media18 <- read_csv("Data/crowdtangle-news/2018-historic-news.csv",
#                     col_types = cols(`Sponsor Id` = col_character(), `Sponsor Name` = col_character(),
#                     `Overperforming Score` = col_number()))
# 
# #Sanity check
# media18 <- media18 %>% mutate(date = ymd_hms(Created),
#                               year = year(date)) %>% filter(year == 2018)
# table(media18$year)
# 
# 
# # Data cleaning
# 
# #load list of stopwords
# stopwords_regex <- paste0('\\b', paste(rev(stopwords('pt')), collapse = '\\b|\\b'), '\\b')
# 
# clean_text <- function(data_column) {
#   # make text lower case
#   data_column = str_to_lower(data_column)
#   # remove non-alphanumeric symbols
#   data_column = str_replace_all(data_column, "[^[:alnum:]]", " ")
#   #remove portuguese stopwords like de, do and da
#   data_column = str_replace_all(data_column, stopwords_regex, " ")
#   #remove accents
#   data_column = stri_trans_general(str = data_column, id = "Latin-ASCII")
#   # collapse multiple spaces
#   str_replace_all(data_column, "\\s+", " ")
# }
# 
# #create new columns with cleaned text
# media18 <- media18 %>%
#            mutate(message_clean = clean_text(Message))
# 
# #create list of political terms
# all_terms <- c("fake", "noticia falsa", "fake news", "kit gay", "mamadeira piroca", "fraude urnas",
#                "haddad", "lula", "bolsonaro", "corrupcao", "crime", "emprego", "economia")
# 
# #create empty list to collect data
# str_exact_dat <- list()
# 
# #for loop to identify all of the times that the news stories include political terms
# for(i in 1:nrow(media18)){
#   str_exact_dat[[i]] <- str_extract(media18$message_clean[i], all_terms)
# }
# 
# #combine list into dataframe
# final <- do.call(rbind.data.frame, str_exact_dat)
# #rename columns
# colnames(final) <- paste0("match", 1:ncol(final), "_extract")
# final_match_fake <-  final
# 
# #if a fake news story, has no term matches, fill NA
# final_match_fake[final_match_fake==""] <- NA
# 
# #combine original data and political term matches
# final_df <- cbind.data.frame(media18, final_match_fake)
# final_df <- final_df %>% rename(fake = match1_extract,
#                                 noticia_falsa = match2_extract,
#                                 fake_news = match3_extract,
#                                 kit_gay = match4_extract,
#                                 mamadeira_piroca = match5_extract,
#                                 fraude_urnas = match6_extract,
#                                 haddad = match7_extract,
#                                 lula = match8_extract,
#                                 bolsonaro = match9_extract,
#                                 corrupcao = match10_extract,
#                                 crime = match11_extract,
#                                 emprego = match12_extract,
#                                 economia = match13_extract)
# 
# #write.csv(final_df, "Data/crowdtangle-news/analysis-2018.csv")
# 
# #Recode
# final_df <- final_df %>% mutate(type = case_when(noticia_falsa == "noticia falsa" |
#                                 fake_news == "fake news" |
#                                 kit_gay == "kit gay" | mamadeira_piroca == "mamadeira piroca" ~ "fake term_limited",
#                                 lula == "lula" | haddad == "haddad" | bolsonaro == "bolsonaro" ~ "candidates",
#                                 corrupcao == "corrupcao" | crime == "crime" | emprego == "emprego" |
#                                 economia == "economia" ~ "issues" ),
#                                 dates = ymd_hms(Created),
#                                 dates2 = as.Date(dates),
#                                 month = month(dates),
#                                 weeks = week(dates))
# 
# final_df %>% group_by(type) %>% summarise(avg_over = mean(`Overperforming Score`, na.rm = T),
#                                           avg_comments = mean(Comments, na.rm = T),
#                                           avg_likes = mean(Likes, na.rm = T),
#                                           avg_shares = mean(Shares, na.rm = T),
#                                           avg_Angry = mean(Angry, na.rm = T))
# 
# # #Manual sample for checking fake
# # set.seed(30030)
# # sample_fn <- final_df %>% filter(type == "fake term_limited" & month %in% c(8, 9, 10)) %>%
# #              sample_n(100)
# #
# # #write.csv(sample_fn, file = "Data/crowdtangle-news/FN-ManualChecking.csv")
# #
# # #Manual sample for checking issues
# # set.seed(30030)
# # sample_is <- final_df %>% filter(type == "issues" & month %in% c(8, 9, 10)) %>%
# #              sample_n(100)
# 
# #write.csv(sample_is, file = "Data/crowdtangle-news/Issues-ManualChecking.csv")
# 
# #### Descriptive analysis
# 
# month_faketerm <- final_df %>%
#   group_by(type, month) %>% summarise(count = n(),
#                                       avg_comments = mean(Comments, na.rm = T),
#                                       avg_shares =  mean(Shares, na.rm = T),
#                                       avg_performance = mean(`Overperforming Score`, na.rm = T)) %>%
#   filter(type == "fake term_limited")
# 
# week_faketerm <- final_df %>%
#   group_by(type, weeks, month) %>% summarise(count = n(),
#                                              avg_comments = mean(Comments, na.rm = T),
#                                              avg_shares =  mean(Shares, na.rm = T),
#                                              avg_performance = mean(`Overperforming Score`, na.rm = T)) %>%
#   filter(type == "fake term_limited" & month %in% c(8, 9, 10))
# 
# 
# #Issues
# 
# month_issues <- final_df %>%
#   group_by(type, month) %>% summarise(count = n(),
#                                       avg_comments = mean(Comments, na.rm = T),
#                                       avg_shares =  mean(Shares, na.rm = T),
#                                       avg_performance = mean(`Overperforming Score`, na.rm = T)) %>% filter(type == "issues")
# 
# week_issues <- final_df %>%
#   group_by(type, weeks, month) %>% summarise(count = n(),
#                                              avg_comments = mean(Comments, na.rm = T),
#                                              avg_shares =  mean(Shares, na.rm = T),
#                                              avg_performance = mean(`Overperforming Score`, na.rm = T)) %>%
#   filter(type == "issues" & month %in% c(8, 9, 10))
# 
# month_issues_economia <- final_df %>% mutate(economia = as.character(economia)) %>%
#   group_by(economia, month) %>% summarise(count = n(),
#                                       avg_comments = mean(Comments, na.rm = T),
#                                       avg_shares =  mean(Shares, na.rm = T),
#                                       avg_performance = mean(`Overperforming Score`, na.rm = T)) %>%
#   filter(economia == "economia")
# 
# 
# week_issues_economia <- final_df %>% mutate(economia = as.character(economia)) %>%
#   group_by(economia, weeks, month) %>% summarise(count = n(),
#                                                  avg_comments = mean(Comments, na.rm = T),
#                                                  avg_shares =  mean(Shares, na.rm = T),
#                                                  avg_performance = mean(`Overperforming Score`, na.rm = T)) %>%
#   filter(economia == "economia" & month %in% c(8, 9, 10))
# 
# 
# all <- bind_rows(week_faketerm, week_issues)
# 
# save(list = c("month_issues", "month_faketerm",
#               "month_issues_economia", "all", "week_issues_economia"), file = "Data/crowdtangle-news/SocialMedia.Rdata")
# 

# Load the aggregated CrowdTangle summaries saved by the commented-out code
# above (month_issues, month_faketerm, month_issues_economia, all,
# week_issues_economia).
load("Data/crowdtangle-news/SocialMedia.Rdata")

# Overperformance: monthly average overperforming score, one LaTeX table per
# group of posts.

# Table L1: posts matching the fake-news terms
print(
  xtable(select(month_faketerm, month, avg_performance)),
  type = "latex",
  file = "Tables/L1.tex"
)
# Table L2: posts matching the issue terms
print(
  xtable(select(month_issues, month, avg_performance)),
  type = "latex",
  file = "Tables/L2.tex"
)
# Table L3: posts matching "economia"
print(
  xtable(select(month_issues_economia, month, avg_performance)),
  type = "latex",
  file = "Tables/L3.tex"
)

# Counts
# Figure L1a: weekly number of posts matching the fake-news terms (rows of
# `all` with type "fake term_limited").
# Reference lines at weeks 40 (dotted) and 43 (long-dashed) -- presumably the
# two election rounds; confirm.
figl1a <- ggplot(
  filter(all, type == "fake term_limited"),
  aes(x = weeks, y = count)
) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_vline(
    xintercept = c(40, 43),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.5, .5)
  ) +
  labs(
    x = "Weeks in the Three Months before 2018 Election",
    y = "Counts of Coverage Fake News"
  ) +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/figl1a.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(figl1a)
dev.off()


# Figure L1b: weekly number of posts matching the issue terms (rows of `all`
# with type "issues"), with reference lines at weeks 40 (dotted) and 43
# (long-dashed).
figl1b <- ggplot(
  filter(all, type == "issues"),
  aes(x = weeks, y = count)
) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_vline(
    xintercept = c(40, 43),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.5, .5)
  ) +
  labs(
    x = "Weeks in the Three Months before 2018 Election",
    y = "Counts of Coverage of Issues"
  ) +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/figl1b.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(figl1b)
dev.off()

# Figure L1c: weekly number of posts matching "economia", with reference
# lines at weeks 40 (dotted) and 43 (long-dashed).
figl1c <- ggplot(week_issues_economia, aes(x = weeks, y = count)) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_vline(
    xintercept = c(40, 43),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.5, .5)
  ) +
  labs(
    x = "Weeks in the Three Months before 2018 Election",
    y = "Counts of Coverage of Economic Issues"
  ) +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/figl1c.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(figl1c)
dev.off()

# Comments

# Figure L2a: weekly average number of comments on posts matching the
# fake-news terms (rows of `all` with type "fake term_limited"), with
# reference lines at weeks 40 (dotted) and 43 (long-dashed).
figl2a <- ggplot(
  filter(all, type == "fake term_limited"),
  aes(x = weeks, y = avg_comments)
) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_vline(
    xintercept = c(40, 43),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.5, .5)
  ) +
  labs(
    x = "Weeks in the Three Months before 2018 Election",
    y = "Comments on Posts about Fake News"
  ) +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/figl2a.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(figl2a)
dev.off()

# Figure L2b: weekly average number of comments on posts matching the issue
# terms (rows of `all` with type "issues"), with reference lines at weeks 40
# (dotted) and 43 (long-dashed).
figl2b <- ggplot(
  filter(all, type == "issues"),
  aes(x = weeks, y = avg_comments)
) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_vline(
    xintercept = c(40, 43),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.5, .5)
  ) +
  labs(
    x = "Weeks in the Three Months before 2018 Election",
    y = "Comments on Posts about Issues"
  ) +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/figl2b.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(figl2b)
dev.off()

# Figure L2c: weekly average number of comments on posts matching "economia",
# with reference lines at weeks 40 (dotted) and 43 (long-dashed).
figl2c <- ggplot(week_issues_economia, aes(x = weeks, y = avg_comments)) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_vline(
    xintercept = c(40, 43),
    linetype = c("dotted", "longdash"),
    color = c("gray", "gray"),
    size = c(.5, .5)
  ) +
  labs(
    x = "Weeks in the Three Months before 2018 Election",
    y = "Comments on Posts about Economic Issues"
  ) +
  theme_bw()

# Write the figure to EPS through the cairo PostScript device.
cairo_ps(
  filename = "Figures/figl2c.eps",
  width = 7,
  height = 7,
  pointsize = 12,
  fallback_resolution = 800
)
print(figl2c)
dev.off()


########## Data cited in Appendix

# "Our final data set contains 14,358 stories": the row count is the first
# element of the dimensions printed here.
dim(dataset_news)

# Analysis of hand coding

# The full data for this section cannot be shared since it contains data from
# CrowdTangle. Only the public hand-coding workbook read below is shared:
# sheet 1 holds the issues sample and sheet 3 the fake-news sample.
hand_issues <- read_excel(
  path = "Data/crowdtangle-news/PlanilhaBoatosIssuesFakeNews_public.xlsx",
  sheet = 1
)
hand_fake <- read_excel(
  path = "Data/crowdtangle-news/PlanilhaBoatosIssuesFakeNews_public.xlsx",
  sheet = 3
)

# Out of the 100 posts in our random sample of posts related to fake news,
# 65 were coverage of fake news and 31 were posts of fact checking. In total,
# 96 were directly related to fake news.
table(hand_fake$CORRECTION_FAKE_NEWS)
table(hand_fake$COVERING)

# We did the same exercise with the sample of posts related to issues
# to determine whether the posts related to coverage of issues (broadly
# understood). Out of the 100 posts in our random sample of posts related to
# issues, 37% were coverage of these issues. The lower ability to detect
# coverage through these terms probably comes from these terms’ usage in
# multiple contexts and with different meanings.
table(hand_issues$ISSUE)


# The search term “crime,” in particular, led to many posts about crime
# coverage in general, but not as a policy issue. If we select the term
# “economia” (46% of posts related to “economia” in our sample were related to
# economic policy coverage), we find similar results.
# Row proportions (margin = 1): share of ISSUE codes within each value of the
# search-term indicator.
prop.table(table(hand_issues$crime, hand_issues$ISSUE), margin = 1)
prop.table(table(hand_issues$economia, hand_issues$ISSUE), margin = 1)

